import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import shapiro
import plotly.express as px
from scipy.stats import skew
from sklearn.preprocessing import LabelEncoder
from scipy.stats import mannwhitneyu
from scipy.stats import chisquare
from scipy.stats import f_oneway
# Load the insurance dataset and take a first look at its structure.
df = pd.read_csv("D:/ML/insurance.csv")
df.head()
rows, cols = df.shape
print("The number of rows is {0} and the number of columns is {1}".format(rows, cols))
df.info()

# Report missing values per column, and only claim the data is complete
# when the counts actually confirm it.
missing_per_column = df.isnull().sum()
print(missing_per_column)
total_missing = missing_per_column.sum()
if total_missing == 0:
    print("There are no missing values in the dataframe")
else:
    print("There are {} missing values in the dataframe".format(total_missing))
df.describe()
# Report the direction of skewness for each numeric column and show its
# histogram with a marginal box plot.
for numeric_column in ("bmi", "age", "charges"):
    # Compute the skewness once and reuse it for the branch and the message.
    skewness = skew(df[numeric_column])
    if skewness > 0:
        print("The Distribution is Right skewed with a value of {}".format(skewness))
    elif skewness < 0:
        print("The Distribution is Left skewed with a value of {}".format(skewness))
    else:
        # Zero skewness only establishes symmetry; it says nothing about
        # normality, so the message does not claim a normal distribution.
        print("The Distribution is Symmetric (zero skewness)")
    fig = px.histogram(df, x=numeric_column, marginal="box", hover_data=df.columns)
    fig.show()
# Count plots for the categorical columns. countplot draws on the current
# Axes, so each chart gets a fresh figure; otherwise the four charts would be
# drawn on top of each other.
for categorical_column in ("children", "smoker", "region", "sex"):
    plt.figure()
    sns.countplot(x=categorical_column, data=df)

# Parallel-categories view of how the categorical columns combine.
col = ["sex", "smoker", "region", "children"]
df_copy = df.loc[:, col]
categories_fig = px.parallel_categories(df_copy, color_continuous_scale=["red", "yellow", "green"])
# Show it explicitly, like the other plotly figures in this script; a bare
# expression is only rendered when it is the last statement of a notebook cell.
categories_fig.show()
# Integer-encode the categorical columns into "<name>_transform" columns.
# The single encoder is refit for every column, so after the loop it holds
# the mapping of the last column only.
labelencoder = LabelEncoder()
for _categorical in ("sex", "region", "smoker"):
    df[_categorical + "_transform"] = labelencoder.fit_transform(df[_categorical])

# Pair plot of the frame, which now includes the encoded columns.
sns.pairplot(df)
# Compare insurance charges of smokers and non-smokers.
# Grouping by a scalar key (not a one-element list) lets get_group take the
# plain labels "yes"/"no" without relying on deprecated pandas behaviour.
charges_by_smoker = df.groupby("smoker")["charges"]
smoker = charges_by_smoker.get_group("yes")
non_smoker = charges_by_smoker.get_group("no")

# Start a new figure so both distributions share one clean Axes instead of
# being drawn onto whatever Axes happened to be current.
plt.figure()
sns.distplot(smoker)
sns.distplot(non_smoker)

# Shapiro-Wilk normality check for each group.
smoker_normality = shapiro(smoker)
non_smoker_normality = shapiro(non_smoker)
print(smoker_normality)
print(non_smoker_normality)
_, smoker_normality_p = smoker_normality
_, non_smoker_normality_p = non_smoker_normality
if smoker_normality_p < 0.05 and non_smoker_normality_p < 0.05:
    print("As statically proven that both the distributions are not gaussian , we use the non-parametric testing statergy")
else:
    # The rank-based test below does not require normality, so it stays
    # valid here; only the justification printed to the user differs.
    print("Normality was not rejected for at least one group; the non-parametric test is used anyway as it does not assume normality")

# Two-sided Mann-Whitney U test; the alternative is passed explicitly because
# the default has differed between SciPy versions.
stat, p = mannwhitneyu(smoker, non_smoker, alternative="two-sided")
print('stat={}, p={}' .format(stat, p))
if p > 0.05:
    print('Both the distributions are same, there is no difference in charges for smokers and non smokers')
else:
    print('Both the distributions are different, there is a significant difference in charges for smokers and non smokers')
# Compare BMI between male and female policy holders.
# The groups are built first: the normality checks below need them.
# Grouping by a scalar key (not a one-element list) lets get_group take the
# plain labels "male"/"female" without relying on deprecated pandas behaviour.
gender = df.groupby("sex")
male_bmi = gender.get_group("male")["bmi"]
female_bmi = gender.get_group("female")["bmi"]

# Start a new figure so both distributions share one clean Axes instead of
# being drawn onto whatever Axes happened to be current.
plt.figure()
sns.distplot(male_bmi)
sns.distplot(female_bmi)

# Shapiro-Wilk normality check for each group.
male_normality = shapiro(male_bmi)
female_normality = shapiro(female_bmi)
print(male_normality)
print(female_normality)
_, male_normality_p = male_normality
_, female_normality_p = female_normality
if male_normality_p < 0.05 and female_normality_p < 0.05:
    print("As statically proven that both the distributions are not gaussian , we use the non-parametric testing statergy")
else:
    # The rank-based test below does not require normality, so it stays
    # valid here; only the justification printed to the user differs.
    print("Normality was not rejected for at least one group; the non-parametric test is used anyway as it does not assume normality")

# Two-sided Mann-Whitney U test; the alternative is passed explicitly because
# the default has differed between SciPy versions.
stat, p = mannwhitneyu(male_bmi, female_bmi, alternative="two-sided")
print('stat={}, p={}' .format(stat, p))
if p > 0.05:
    print('Both the distributions are same, there is no difference in BMI between Male and Female')
else:
    print('Both the distributions are different, there is a significant difference in in BMI between Male and Female')
# Test whether the proportion of smokers differs between the sexes.
# Rows of the contingency table are the sexes, columns are the smoker labels.
smoker_gender = pd.crosstab(df['sex'], df['smoker'])

# A chi-square test of independence on the full table accounts for the size
# of each sex group. A goodness-of-fit test on the smoker counts alone would
# only ask whether the two raw counts are equal, which is not a statement
# about proportions.
stat, p = stats.chi2_contingency(smoker_gender)[:2]
print('stat={}, p={}' .format(stat, p))
if p > 0.05:
    print('There is no difference in propotion of smokers between Male and Female')
else:
    print('There is a difference in propotion of smokers between Male and Female')
# One-way ANOVA: does BMI differ across women with zero, one and two children?
# The conclusions printed below are about women, so the frame is restricted
# to female rows before grouping by the number of children.
bmi = df[df["sex"] == "female"].groupby("children")
bmi_0 = bmi.get_group(0)["bmi"]
bmi_1 = bmi.get_group(1)["bmi"]
bmi_2 = bmi.get_group(2)["bmi"]

# Start a new figure so the three distributions share one clean Axes instead
# of being drawn onto whatever Axes happened to be current.
plt.figure()
sns.distplot(bmi_0)
sns.distplot(bmi_1)
sns.distplot(bmi_2)

stat, p = f_oneway(bmi_0, bmi_1, bmi_2)
print('stat={}, p={}'.format(stat, p))
if p > 0.05:
    print('There is no significant difference in bmi across women with no children, one child and two children')
else:
    print('There is significant difference in bmi across women with no children, one child and two children')